pacman::p_load(tidyverse, data.table,broom,readxl,lfe,modelsummary,kableExtra,
               latex2exp,ggpubr,ggpmisc)
options("modelsummary_format_numeric_latex" = "plain")
rm(list=ls())
`%!in%` = Negate(`%in%`)
################################################################################

#Spatial units
spatial_id_a= 'county_id' 
spatial_id_b= 'amc_id'

df = fread(paste0('../../data/landuse/clean/geographicunits_argentina.csv.gz'))[, list(country, region, state, state_id, county_id)]
df$spatial_id = df$county_id
df_a = df[,list(county_id,spatial_id,state_id)]
df=fread(paste0('../../data/landuse/clean/geographicunits_brazil.csv.gz'))[, list(country, region, state, state_id, mesoregion_id, microregion_id, amc_id, county_id)]
setnames(df, old=spatial_id_b, new='spatial_id')
df_b = unique(df[,list(county_id,spatial_id,state_id)], by = c("county_id","spatial_id","state_id"))
df_u_cw = unique(rbind(df_a,df_b), by=c("county_id","spatial_id","state_id"))

df = fread(paste0('../../data/agribusiness/clean/distance_to_hub_all.csv.gz'))
df = merge(df,df_u_cw,by='county_id')
df_1 = df[, list(spatial_id, commodity, ma_type, product_type, ma)][, lapply(.SD, mean, na.rm=TRUE), by=list(spatial_id, commodity, ma_type, product_type)]
df_2 = df[, list(spatial_id, commodity, ma_type, product_type, size_q_i)][, lapply(.SD, sum, na.rm=TRUE), by=list(spatial_id, commodity, ma_type, product_type)]
df_ma = merge(df_1,df_2, by=c('spatial_id', 'commodity', 'ma_type', 'product_type'))


##################### Land shares

for (country in c('argentina','brazil') ){
  
  df_l = fread(paste0('../../data/landuse/clean/landuse_',country,'_county.csv.gz'))
  df_u = fread(paste0('../../data/landuse/clean/geographicunits_',country,'.csv.gz'))
  crop_list = c('soybean','maize')
  
  if (country=='argentina'){
    
    #Spatial unit crosswalk
    df_u = df_u[, list(county_id, state_id, region, land_area_km2)]
    
    #Land
    df = merge(df_l, df_u, by='county_id')
    df = df[, c('forest','pasture') := list(forest_natural+forest_planted,pasture + forage_seasonal+forage_permanent)]
    setnames(df, old = c(spatial_id_a),  new = c('spatial_id'))
    df = df[,  list(year, region, state_id, spatial_id, pasture, forest, soybean, maize)][, lapply(.SD, sum, na.rm=TRUE), by=list(year, region, state_id, spatial_id)][, unit:='hectares']
    setnames(df, old=c('soybean','maize','pasture'), new=c('soybean_ha','maize_ha','pasture_ha'))
    df[, c('agriculture_ha','nature_ha'):=list(soybean_ha+maize_ha+pasture_ha,forest)]
    df[, c('soybean','maize','pasture','share_agriculture'):=list(soybean_ha/agriculture_ha, maize_ha/agriculture_ha, pasture_ha/agriculture_ha, agriculture_ha/(agriculture_ha+nature_ha) )]
    df = df[,list(year, country, region, state_id, spatial_id, soybean, maize, pasture, share_agriculture)][is.finite(share_agriculture),]
    df_commodity = melt(df[,list(year, spatial_id, soybean, maize, pasture)], id.vars=c("year","spatial_id"), variable.name="commodity", value.name="share_commodity")
    df_nests = df[,list(year, country, region, state_id, spatial_id, share_agriculture)]
    df_land = merge(df_nests,df_commodity, by=c('year','spatial_id'))
        
    #Farm-gate commodity prices
    df = fread(paste0('../../data/prices/clean/farmgate_crops_argentina_',spatial_id_a,'.csv.gz'))
    df=df[year!=2018,]
    df[year==2017,]$year=2018
    df_pc = df[commodity %in% crop_list, list(year, region, state_id, spatial_id, commodity, price)][, lapply(.SD, mean, na.rm=TRUE), by=list(year, region, state_id, spatial_id, commodity)][, unit:='current USD per tonne']
    df = fread(paste0('../../data/prices/clean/farmgate_cattle_argentina_',spatial_id_a,'.csv.gz'))[, commodity:='pasture']
    setnames(df, old = c('value'),  new = c('price'))
    df_pp = df[establishment_type %in% c('all establishments') & value_type %in% c('price_per_tn'), list(year, region, state_id, spatial_id, commodity, price)][, lapply(.SD, mean, na.rm=TRUE), by=list(year, region, state_id, spatial_id, commodity)][, unit:='current USD per tonne']
    df_prices = rbind(df_pc, df_pp)
    
    df_ar = merge(df_land, df_prices[, list(year,spatial_id,commodity,price)], by=c('year','spatial_id','commodity'), all.x=T)
    df_ar$country = 'ARGENTINA'
    
  }else{
    
    #Spatial unit crosswalk
    df_u = df_u[, list(county_id, amc_id, microregion_id, mesoregion_id, state_id, region, land_area_km2)]
    
    #Land
    df = merge(df_l, df_u, by='county_id')
    df = df[, c('pasture','forest') := list(pasture_natural+pasture_planted,forest_natural+forest_planted)]
    setnames(df, old = c(spatial_id_b),  new = c('spatial_id'))
    df = df[,  list(year, region, state_id, spatial_id, pasture, forest, soybean, maize)][, lapply(.SD, sum, na.rm=TRUE), by=list(year, region, state_id, spatial_id)][, unit:='hectares']
    setnames(df, old=c('soybean','maize','pasture'), new=c('soybean_ha','maize_ha','pasture_ha'))
    df[, c('agriculture_ha','nature_ha'):=list(soybean_ha+maize_ha+pasture_ha,forest)]
    df[, c('soybean','maize','pasture','share_agriculture'):=list(soybean_ha/agriculture_ha, maize_ha/agriculture_ha, pasture_ha/agriculture_ha, agriculture_ha/(agriculture_ha+nature_ha) )]
    df = df[,list(year, country, region, state_id, spatial_id, soybean, maize, pasture, share_agriculture)][is.finite(share_agriculture),]
    df_commodity = melt(df[,list(year, spatial_id, soybean, maize, pasture)], id.vars=c("year","spatial_id"), variable.name="commodity", value.name="share_commodity")
    df_nests = df[,list(year, country, region, state_id, spatial_id, share_agriculture)]
    df_land = merge(df_nests,df_commodity, by=c('year','spatial_id'))
    
    #Farm-gate commodity prices
    df = fread(paste0('../../data/prices/clean/farmgate_crops_brazil_',spatial_id_b,'.csv.gz'))
    df_pc = df[commodity %in% crop_list, list(year, region, state_id, spatial_id, commodity, price)][, lapply(.SD, mean, na.rm=TRUE), by=list(year, region, state_id, spatial_id, commodity)][, unit:='current USD per tonne']
    df = fread(paste0('../../data/prices/clean/farmgate_cattle_brazil_',spatial_id_b,'.csv.gz'))[, commodity:='pasture']
    setnames(df, old = c('value'),  new = c('price'))
    df_pp = df[establishment_type %in% c('cattle establishments') & value_type %in% c('breeders_p','calves_p'), list(year, region, state_id, spatial_id, commodity, price)][, lapply(.SD, mean, na.rm=TRUE), by=list(year, region, state_id, spatial_id, commodity)][, unit:='current USD per head']
    beef_tn_per_head = 0.25
    df_pp$price = df_pp$price/beef_tn_per_head
    df_prices = rbind(df_pc, df_pp)
    
    df_br = merge(df_land, df_prices[, list(year,spatial_id,commodity,price)], by=c('year','spatial_id','commodity'), all.x=T)
    df_br$country = 'BRAZIL'
    
  }
}

df_land = rbind(df_ar,df_br)
setnames(df_land, old=c('price'), new=c('price_i'))


##################### Output elasticities

df = fread('inputs/parameters_supply.csv')[,list(spatial_id, theta_lambda, theta, frontier)]
df_params = df[, lapply(.SD, mean, na.rm=TRUE), by=list(spatial_id)]
df_factors = setDT(read_excel('../../data/factors/raw/factor_shares.xlsx'))[,c('commodity','land_share')]
df = merge(df_land,df_params,  by=c('spatial_id'))
df = merge(df,df_factors, by=c('commodity'))
df[, c('elasticity_QP_i'):=list(  ( (theta_lambda-1)*(1-share_commodity) + (theta-1)*share_commodity*(1-share_agriculture)+(1-land_share) )/land_share ) ]
df[, c('elasticity_PQ_i'):=list( 1/elasticity_QP_i ) ]
df[commodity=='pasture',]$commodity = 'beef'
df_elasticity = df


##################### Intermediaries

exporter_type = c('exporter_group')
destination_type_a = 'port'
destination_type_b = 'hub'
id_type_a = c('county_known')
id_type_b = c('county_known')
year_max = 2018

for (country in c('argentina','brazil') ){
  
  if (country=='argentina'){
    
    destination_type = destination_type_a
    df = fread(paste0('../../data/agribusiness/clean/soybean_',country,'.csv.gz'))[id_type %in% id_type_a & state %!in% c('UNKNOWN','IMPORTS + STOCK') & destination_bloc != 'UNKNOWN',]
    setnames(df, old = c(spatial_id_a,exporter_type,destination_type),  new = c('spatial_id','firm','destination_unit'))
    
    df_n_ij = df[, list(year, spatial_id, destination_unit, commodity, product_type, firm)][ , .(N_ij = length(unique(firm))), by=list(year, spatial_id, destination_unit, commodity, product_type)]
    df_n_i = df[, list(year, spatial_id, commodity, product_type, firm)][ , .(N_i = length(unique(firm))), by=list(year, spatial_id, commodity, product_type)]
    df_n = merge(df_n_ij,df_n_i, by=c('year','spatial_id','commodity','product_type'), all.x=T)
    
    df_q_ij = df[, list(year, country, region, state_id, spatial_id, destination_unit, equivalent_tonnes, fob_usd, commodity, product_type)][, lapply(.SD, sum, na.rm=TRUE), by=list(year, country, region, state_id, spatial_id, destination_unit, commodity, product_type)]
    df_q_i = df[, list(year, country, region, state_id, spatial_id, equivalent_tonnes, fob_usd, commodity, product_type)][, lapply(.SD, sum, na.rm=TRUE), by=list(year, country, region, state_id, spatial_id, commodity, product_type)]
    df_s = merge(df_q_ij,df_q_i, by=c('year','country','region','state_id','spatial_id','commodity','product_type'))[, c('share_ij_tonnes','share_ij_usd','price_ij','q_ij','value_ij') := list(equivalent_tonnes.x/equivalent_tonnes.y,fob_usd.x/fob_usd.y,fob_usd.x/equivalent_tonnes.x, equivalent_tonnes.x,fob_usd.x)][, list(year, country, region, state_id, spatial_id, destination_unit, share_ij_tonnes, share_ij_usd, price_ij, q_ij, value_ij, commodity, product_type)]
    
    df_a_t = merge(df_s,df_n, by=c('year','spatial_id','destination_unit','commodity','product_type'))
    df_a = df_a_t[year<=year_max,][, c("year") := NULL][, lapply(.SD, mean, na.rm=TRUE),  by=list(country, region, state_id, spatial_id, destination_unit, commodity, product_type)]
    
  }else{
    
    destination_type = destination_type_b
    n_c=1
    for (commodity in c('soybean','beef','maize')) {
      
      df = fread(paste0('../../data/agribusiness/clean/',commodity,'_',country,'.csv.gz'))[id_type %in% id_type_b & state !='UNKNOWN STATE'  & destination != 'BRAZIL' & destination_bloc != 'UNKNOWN',]
      if(commodity=='beef'){df$port = df$hub}
      setnames(df, old = c(spatial_id_b,exporter_type,destination_type),  new = c('spatial_id','firm','destination_unit'))
      
      df_n_ij = df[, list(year, spatial_id, destination_unit, commodity, product_type, firm)][ , .(N_ij = length(unique(firm))), by=list(year, spatial_id, destination_unit, commodity, product_type)]
      df_n_i = df[, list(year, spatial_id, commodity, product_type, firm)][ , .(N_i = length(unique(firm))), by=list(year, spatial_id, commodity, product_type)]
      df_n = merge(df_n_ij,df_n_i, by=c('year','spatial_id','commodity','product_type'), all.x=T)
      
      df_q_ij = df[, list(year, country, region, state_id, spatial_id, destination_unit, equivalent_tonnes, fob_usd, commodity, product_type)][, lapply(.SD, sum, na.rm=TRUE), by=list(year, country, region, state_id, spatial_id, destination_unit, commodity, product_type)]
      df_q_i = df[, list(year, country, region, state_id, spatial_id,equivalent_tonnes,fob_usd, commodity, product_type)][, lapply(.SD, sum, na.rm=TRUE), by=list(year, country, region, state_id, spatial_id, commodity, product_type)]
      df_s = merge(df_q_ij,df_q_i, by=c('year','country','region','state_id','spatial_id','commodity','product_type'))[, c('share_ij_tonnes','share_ij_usd','price_ij','q_ij','value_ij') := list(equivalent_tonnes.x/equivalent_tonnes.y,fob_usd.x/fob_usd.y,fob_usd.x/equivalent_tonnes.x, equivalent_tonnes.x,fob_usd.x)][, list(year, country, region, state_id, spatial_id, destination_unit, share_ij_tonnes, share_ij_usd, price_ij, q_ij, value_ij, commodity, product_type)]

      df_b_t_c = merge(df_s,df_n, by=c('year','spatial_id','destination_unit','commodity', 'product_type'))
      df_b_c = df_b_t_c[year<=year_max,][, c("year") := NULL][, lapply(.SD, mean, na.rm=TRUE),  by=list(country, region, state_id, spatial_id, destination_unit, commodity, product_type)]
      
      if(n_c==1){
        df_b_t = df_b_t_c
        df_b = df_b_c
      }else{
        df_b_t = rbind(df_b_t,df_b_t_c)
        df_b = rbind(df_b,df_b_c)
      }
      n_c=n_c+1
    } 
  }
}

df_shares = rbind(df_a_t,df_b_t)


##################### Price gaps and markdowns

#Markdowns
df = merge(df_shares, df_elasticity, by=c('commodity','year','country','region','state_id','spatial_id'))
df=df[price_ij>0 & price_i>0,]
df$mu_i = (1+df$elasticity_PQ_i/df$N_i)^(-1)
df[N_i==0,]$mu_i=NA
df$N_i = as.numeric(df$N_i)

#Price gap
df$price_ratio_ij = df$price_ij/df$price_i
max_var = quantile(df$price_ratio_ij,0.975)
min_var = quantile(df$price_ratio_ij,0.025)
df = df[price_ratio_ij>min_var & price_ratio_ij<max_var,]
df$price_ratio_ij = df$price_ratio_ij + (1-min(df$price_ratio_ij))

df_prices = df


##################### Distance

#Assign hubs to county/AMC
df_ar_s = fread(paste0("../../data/agribusiness/clean/ports_matched_soybean_argentina.csv"), header=T)[,list(port, county_id)] #manually produced
setnames(df_ar_s, old=c('port','county_id'), new=c('destination_unit','destination_unit_spatial_id'))
df_ar_s = unique(df_ar_s, by=c("destination_unit"))

df_br_b = fread(paste0("../../data/agribusiness/clean/hubs_matched_beef_brazil.csv"), header=T)[,list(hub, amc_id)] #manually produced
setnames(df_br_b, old=c('hub','amc_id'), new=c('destination_unit','destination_unit_spatial_id'))
df_br_b = unique(df_br_b, by=c("destination_unit"))

df_br_s = fread(paste0("../../data/agribusiness/clean/hubs_matched_soybean_brazil.csv"), header=T)[,list(hub, amc_id)] #manually produced
setnames(df_br_s, old=c('hub','amc_id'), new=c('destination_unit','destination_unit_spatial_id'))
df_br_s = unique(df_br_s, by=c("destination_unit"))

df_br_m = fread(paste0("../../data/agribusiness/clean/hubs_matched_maize_brazil.csv"), header=T)[,list(hub, amc_id)] #manually produced
setnames(df_br_m, old=c('hub','amc_id'), new=c('destination_unit','destination_unit_spatial_id'))
df_br_m = unique(df_br_m, by=c("destination_unit"))

df_destination_unit_id = unique(rbind(df_ar_s,df_br_b,df_br_s,df_br_m),by=c("destination_unit"))
df_prices_id = merge(df_prices,df_destination_unit_id,by=c("destination_unit"))

#Argentina
df = fread(paste0("../../data/agribusiness/clean/soybean_argentina_county_port_distance.csv.gz"))
setnames(df, old=c('county_id','county_id_port'), new=c('spatial_id','destination_unit_spatial_id'))
df_ar_d = df[,list(spatial_id,destination_unit_spatial_id,distance_km)][, lapply(.SD, median), by=.(spatial_id, destination_unit_spatial_id)]

#Brazil 
df = fread('../../data/landuse/clean/geographicunits_brazil.csv.gz')
df_cw = unique(df[,list(county_id,amc_id)], by=c('county_id','amc_id'))
setnames(df_cw, old=c('county_id','amc_id'), new=c('county_id_hub','destination_unit_spatial_id'))

for(commodity in c('beef','soybean','maize')){
  
  df = fread(paste0("../../data/agribusiness/clean/",commodity,"_brazil_county_hub_distance.csv.gz"))
  df = merge(df,df_cw, by=c('county_id_hub'))
  df$spatial_id = df$amc_id
  df = df[,list(spatial_id,destination_unit_spatial_id,distance_km)][, lapply(.SD, median), by=.(spatial_id,destination_unit_spatial_id)]
  if(commodity=='beef'){df_br_b_d=df}
  if(commodity=='soybean'){df_br_s_d=df}
  if(commodity=='maize'){df_br_m_d=df}
  
}
  
df_br_d = rbind(df_br_b_d,df_br_s_d,df_br_m_d)
df_br_d = df_br_d[, lapply(.SD, median), by=.(spatial_id,destination_unit_spatial_id)]

#All
df_d=rbind(df_ar_d,df_br_d)
df_main = merge(df_prices_id, df_d, by=c('spatial_id','destination_unit_spatial_id'))
df_main = merge(df_main,df_ma[ma_type=='all_hubs_weighted',], all.x=T, by=c('spatial_id', 'commodity', 'product_type'))


##################### Regressions

df = df_main
df$y = -log(df$price_ratio_ij)
df$x1 = log(df$mu_i)
df$x2 = log(df$distance_km)

df$spatial_fe = df$spatial_id 
df = df[is.finite(x1) & is.finite(x2) & is.finite(y),]
df$year = as.factor(df$year)
df_e = unique(df[,c('commodity','product_type')])
df_e$ck = paste0(df_e$commodity,'_',df_e$product_type)

df = merge(df, df_e, by=c('commodity','product_type'))

mp_0a = felm(y ~ x1 | 0 | 0 | spatial_id ,  data=df)
mp_0c = felm(y ~ x1  + x2 | 0 | 0 | spatial_id ,  data=df)

mp_1a = felm(y ~ x1 | commodity | 0 | spatial_id ,  data=df)
mp_1c = felm(y ~ x1  + x2 | commodity | 0 | spatial_id ,  data=df)

mp_2a = felm(y ~ x1 | ck | 0 | spatial_id ,  data=df)
mp_2c = felm(y ~ x1  + x2| ck | 0 | spatial_id ,  data=df)

#Table
rows <- tribble(~term,                                        ~OLS, ~OLS, ~OLS,
                'Commodity fixed effects'                   , ' ',  '$\\checkmark$', ' ',
                'Commodity-by-product fixed effects'        , ' ',  ' ',  '$\\checkmark$')
gm <- tibble::tribble(
  ~raw,        ~clean,          ~fmt,
  "nobs",      "Observations",  0)
attr(rows, 'position') <- c(5,6)
table_short_final = msummary(list(mp_0c,mp_1c,mp_2c),
                             title =  '\\label{tab: estimation - markdowns - pricegap}External markdowns vs. model-implied markdowns.',
                             coef_map =  c('x1' ='log (model-implied markdown) ', 'x2' ='log (distance to hub)'),
                             coef_omit = 'Intercept',
                             estimate = "{estimate}{stars}",
                             gof_map = gm, 
                             escape=F,
                             add_rows = rows,
                             output='latex') %>% add_header_above(c(" " = 1, "Dep. var.: log (external markdown)" = 3)) 

table_short_final = table_short_final %>% kable_styling(latex_options = "HOLD_position")
kableExtra::save_kable(table_short_final, file="../../output/tables/appendix/table6_markdownvalidation.tex")

